import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
There are three CSV files that I am using. First is a coronavirus dataset that contains data for the different types of cases (confirmed, death, and recovered) for different countries recorded daily since the day of the outbreak. The second contains the data for the number of vaccine doses administered, number of people partially and fully vaccinated, daily. The third dataset, that I have received from the public source but not used, unfortunately, is the world population dataset, which contains the population for all countries.
These CSV files are created by Johns Hopkins University Centre for Systems Science and Engineering and they are available in a GitHub Public Repository, linked below. This data has been compiled from sources like the World Health Organization (WHO), the Centres for Disease Control and Prevention (CDC), and the Ministry of Health from multiple countries.
Dataset link: https://github.com/RamiKrispin/coronavirus These datasets can be considered a Big Data dataset because they fulfil all the Vs of Big Data.
Velocity: These datasets are updated daily and are available on a public GitHub repository.
Volume: These available datasets take more than 50GB of storage, which includes text, numbers, and date series. In this assignment, I will be using four different CSV files: Cumulative Confirmed Case, Recovered Case, Death Case, and stats data. Each file contains the country name, Province/State, Latitude, Longitude, and a date series (Confirmed/Recovered/Death respectively) from 22/JAN/2020 to date (updated on a daily basis).
Variety: The dataset is updated from different sources. Refer: Dataset Link
# Load the three CSV inputs into data frames.
VIRUS_CSV = "coronavirus.csv"
VACCINE_CSV = "covid19_vaccine.csv"
POPULATION_CSV = "world_population.csv"

virus = pd.read_csv(VIRUS_CSV)
vaccine = pd.read_csv(VACCINE_CSV)
pop = pd.read_csv(POPULATION_CSV)
The virus and vaccine datasets are huge. They contain features and values that need processing. For example, the virus dataset has a column of ‘type’, that has three unique values, confirmed, death, and recovered. I used the Pandas and the NumPy libraries to manipulate, clean, and integrate both datasets.
Virus dataset: I check for the number of unique countries by using the nunique() function. As there were a lot of unique countries, I decided to drop the column, along with the province, lat, and long columns, using the drop() function. Next, I created three different datasets, for the three different types of cases, and divide the data of the main dataset into the three datasets using Boolean masking. Next, I used the groupby() function to sum the total cases for each date, so that we have the total number of cases for each unique date.
# Parse the 'date' column into datetimes, then list the distinct values of
# the 'country' and 'type' columns.
virus['date'] = pd.to_datetime(virus['date'])
for categorical_column in ('country', 'type'):
    print(virus[categorical_column].unique())
['Canada' 'United Kingdom' 'China' 'Netherlands' 'Australia' 'New Zealand' 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burma' 'Burundi' 'Cabo Verde' 'Cambodia' 'Cameroon' 'Central African Republic' 'Chad' 'Chile' 'Colombia' 'Comoros' 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess' 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji' 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Holy See' 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan' 'Kenya' 'Kiribati' 'Korea, South' 'Kosovo' 'Kuwait' 'Kyrgyzstan' 'Laos' 'Latvia' 'Lebanon' 'Lesotho' 'Liberia' 'Libya' 'Liechtenstein' 'Lithuania' 'Luxembourg' 'Madagascar' 'Malawi' 'Malaysia' 'Maldives' 'Mali' 'Malta' 'Marshall Islands' 'Mauritania' 'Mauritius' 'Mexico' 'Micronesia' 'Moldova' 'Monaco' 'Mongolia' 'Montenegro' 'Morocco' 'Mozambique' 'MS Zaandam' 'Namibia' 'Nepal' 'Nicaragua' 'Niger' 'Nigeria' 'North Macedonia' 'Norway' 'Oman' 'Pakistan' 'Palau' 'Panama' 'Papua New Guinea' 'Paraguay' 'Peru' 'Philippines' 'Poland' 'Portugal' 'Qatar' 'Romania' 'Russia' 'Rwanda' 'Saint Kitts and Nevis' 'Saint Lucia' 'Saint Vincent and the Grenadines' 'Samoa' 'San Marino' 'Sao Tome and Principe' 'Saudi Arabia' 'Senegal' 'Serbia' 'Seychelles' 'Sierra Leone' 'Singapore' 'Slovakia' 'Slovenia' 'Solomon Islands' 'Somalia' 'South Africa' 'South Sudan' 'Spain' 'Sri Lanka' 'Sudan' 'Summer Olympics 2020' 'Suriname' 'Sweden' 'Switzerland' 'Syria' 
'Taiwan*' 'Tajikistan' 'Tanzania' 'Thailand' 'Timor-Leste' 'Togo' 'Trinidad and Tobago' 'Tunisia' 'Turkey' 'Uganda' 'Ukraine' 'United Arab Emirates' 'Uruguay' 'US' 'Uzbekistan' 'Vanuatu' 'Venezuela' 'Vietnam' 'West Bank and Gaza' 'Yemen' 'Zambia' 'Zimbabwe'] ['confirmed' 'death' 'recovered']
# Use nunique() to get the number of unique countries in the dataset
# (missing values are not counted).
virus['country'].nunique(dropna=True)
195
# Number of distinct values in each column of the virus table.
distinct_per_column = virus.nunique()
print(distinct_per_column)
date 630 province 87 country 195 lat 278 long 279 type 3 cases 15234 dtype: int64
# Encode the case type as an integer: confirmed -> 0, death -> -1,
# recovered -> 1. Values missing from the mapping become NaN.
case_type_codes = {'confirmed': 0, 'death': -1, 'recovered': 1}
virus['type'] = virus['type'].map(case_type_codes)
Vaccine Dataset: I followed a similar approach of dropping unnecessary features, creating six different data frames for the six continents present in the data, and using Boolean masking to transfer data from the main dataset to those data frames. Here, I used a different function, pivot_table(), to sum up the three different vaccine features for each unique date.
# Remove the identifier/code columns that are not used, then discard rows
# that contain missing values.
unused_vaccine_columns = [
    'uid', 'province_state', 'iso2', 'iso3',
    'code3', 'fips', 'combined_key', 'continent_code',
]
vaccine = vaccine.drop(columns=unused_vaccine_columns).dropna()
# Distinct-value count for each remaining column.
vaccine.nunique()
country_region 157 date 303 doses_admin 21110 people_partially_vaccinated 19108 people_fully_vaccinated 16784 report_date_string 303 lat 157 long 157 population 157 continent_name 6 dtype: int64
# Show which continents appear in the vaccine table.
continent_names = vaccine['continent_name'].unique()
print(continent_names)
['Asia' 'Europe' 'Africa' 'North America' 'South America' 'Oceania']
# Drop the two indicator columns, then count the distinct 'Country Name'
# values (missing values are not counted).
pop = pop.drop(columns=['Indicator Name', 'Indicator Code'])
pop['Country Name'].nunique(dropna=True)
266
We need to pre process the data, such as cleaning it and grouping and merging the data as per our needs to extract insights.
We will use the Coronavirus data to create an Interactive Graph for Confirmed Cases, Death Cases, Recovered Cases and Active Cases. These four types will be in form of buttons as a choice. The form will be a scatter plot.
# Report how many distinct countries each dataset contains, to see how far
# the three sources differ in coverage.
country_columns = (
    ('No. of unique countries in the virus dataset:', virus['country']),
    ('No. of unique countries in the Vaccine dataset:', vaccine['country_region']),
    ('No. of unique countries in the World popultaion dataset:', pop['Country Name']),
)
for message, country_column in country_columns:
    print(message, country_column.nunique())
No. of unique countries in the virus dataset: 195 No. of unique countries in the Vaccine dataset: 157 No. of unique countries in the World popultaion dataset: 266
As there are different numbers of unique countries in the different datasets, we will need to do a lot of pre-processing on all the datasets to make sure the visualizations are accurate.
Lets start with the Virus Dataset!!
# Drop the columns we do not need, order the rows by country, and discard
# incomplete rows. The set_index/reset_index round trip moves 'country' to
# the first column and renumbers the rows.
virus = (
    virus.drop(columns=['province', 'lat', 'long'])
    .sort_values(by='country', ascending=True)
    .set_index('country')
    .reset_index()
    .dropna()
)
# Order the rows chronologically by the 'date' column.
virus = virus.sort_values(by='date', ascending=True)
# Print the earliest and the latest date found in the dataset.
date_column = virus['date']
print(date_column.min())
print(date_column.max())
2020-01-22 00:00:00 2021-10-12 00:00:00
# Empty frames, one per case type, all sharing the same column layout.
case_frame_columns = ['country', 'date', 'type', 'cases']
confirmed_cases = pd.DataFrame(columns=case_frame_columns)
death_cases = pd.DataFrame(columns=case_frame_columns)
recovered_cases = pd.DataFrame(columns=case_frame_columns)
We will use the Boolean Masking Mechanism to Transfer data from the Main dataset into the 3 different datasets
# Transfer the confirmed rows (type code 0) into the confirmed-cases frame:
# where() blanks every non-matching row and dropna() then removes them.
is_confirmed = virus['type'] == 0
for field in ('country', 'date', 'type', 'cases'):
    confirmed_cases[field] = virus[field].where(is_confirmed)
confirmed_cases = confirmed_cases.dropna()
# Transfer the death rows (type code -1) into the death-cases frame:
# where() blanks every non-matching row and dropna() then removes them.
is_death = virus['type'] == -1
for field in ('country', 'date', 'type', 'cases'):
    death_cases[field] = virus[field].where(is_death)
death_cases = death_cases.dropna()
# Transfer the recovered rows (type code 1) into the recovered-cases frame:
# where() blanks every non-matching row and dropna() then removes them.
is_recovered = virus['type'] == 1
for field in ('country', 'date', 'type', 'cases'):
    recovered_cases[field] = virus[field].where(is_recovered)
recovered_cases = recovered_cases.dropna()
# Verify that the data was separated properly: print the sum of each
# case-type frame and the grand total of the three.
confirmed_sum = confirmed_cases['cases'].sum()
death_sum = death_cases['cases'].sum()
recovered_sum = recovered_cases['cases'].sum()
print('The total number of confirmed cases are:', confirmed_sum)
print('The total number of deaths are:', death_sum)
print('The total number of recovered cases are:', recovered_sum)
total = confirmed_sum + death_sum + recovered_sum
print('The total number of cases are:', total)
The total number of confirmed cases are: 238704757.0 The total number of deaths are: 4865619.0 The total number of recovered cases are: 130899061.0 The total number of cases are: 374469437.0
We will use the groupby() function to group the cases by the date column and calculate the sum of cases for each date, so that we don't have multiple values for one date. As the groupby() aggregation returns a Series, we will use the to_frame() function to convert it back into a DataFrame.
# Sum the confirmed cases per date; the result has one row per date with
# the columns 'date' and 'cases'.
confirmed_cases = (
    confirmed_cases.drop(columns='country')
    .groupby('date')['cases']
    .sum()
    .reset_index()
)
# Sum the death cases per date; the result has one row per date with the
# columns 'date' and 'cases'.
death_cases = (
    death_cases.drop(columns='country')
    .groupby('date')['cases']
    .sum()
    .reset_index()
)
# Sum the recovered cases per date; the result has one row per date with
# the columns 'date' and 'cases'.
recovered_cases = (
    recovered_cases.drop(columns='country')
    .groupby('date')['cases']
    .sum()
    .reset_index()
)
We create a new dataset, Active in which we calculate the active cases by subtracting the sum of recovered and death cases from the Confirmed cases
# We create the active cases dataset: confirmed - (deaths + recovered),
# computed per date.
confirmed_by_date = confirmed_cases.set_index('date')['cases']
# Deaths and recoveries are matched to the confirmed dates by date value
# rather than by row position, so the three frames need not have identical
# row counts. A date with no death/recovery record contributes 0, which
# keeps the 'cases' column numeric (no NaN or empty-string placeholders).
deaths_by_date = death_cases.set_index('date')['cases'].reindex(
    confirmed_by_date.index, fill_value=0)
recovered_by_date = recovered_cases.set_index('date')['cases'].reindex(
    confirmed_by_date.index, fill_value=0)
active_cases = (
    (confirmed_by_date - (deaths_by_date + recovered_by_date))
    .rename('cases')
    .reset_index()
)
# There are some negative values in the recovered dataset, that we replace
# with zero. A single .loc assignment writes into the frame itself; the
# chained form frame['cases'][mask] = 0 may write into a temporary copy
# (pandas reports this as SettingWithCopyWarning).
recovered_cases.loc[recovered_cases['cases'] < 0, 'cases'] = 0
<ipython-input-23-0c567c212d24>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy recovered_cases['cases'][recovered_cases['cases']<0]=0
For the Virus dataset, I wanted to simplify the visualization for the audience, so that they can choose what they want to see. There are three types of cases in the virus dataset-confirmed, death, and recovered, the fourth one, active cases, I created by subtracting the sum of recovered and death cases from confirmed cases.
The choice of my graph for this dataset is scatter plots. I chose scatter plots because these graphs represent the number of daily cases worldwide. As the markers in the scatter plots get spread all over the window, they can provide a better understanding of how dense or how narrow the situation is.
As the increase in the number of cases is not a good sign, to show the severity of the situation I used the color scale parameter, which uses darker colors for larger numerical values and lighter colors for smaller numerical values. Plotly displays a color scale bar on the right side of the window.
The functions that I have used are detailed below:
add_trace(): comes in plotly package will allow us to trace the details from time to time and "hovertemplate" attribute contains the details of both the axis when hovered.
update_layout(): contains an attribute "updatemenus" which allow us to add any type of menu with additional details. In this visualisation, I have selected "buttons" in "down" direction.
update_xaxes(): updates the x-axis settings. I have set "showticklabels=True" so that the x-axis labels are displayed, and I also use this call to attach the range slider and the range selector.
update_layout(): will help to design the background layout to the graph by adding x_label, y_label, titles and so on.
rangeslider() will enable the user to select the desired range for which they want to see the data.
rangeselector() are pre programmed buttons that will enable users to directly select the time period for which they want to see the visualization.
# Interactive scatter plot of the daily counts: one trace per case type, with
# buttons that switch between the traces.
cases = [confirmed_cases, death_cases, recovered_cases, active_cases]
# (button label, figure title) for each entry of `cases`, in the same order.
case_views = [
    ('Confirmed Cases', 'Number of Cases recorded daily'),
    ('Death Cases', 'Number of Daily Deaths'),
    ('Recovered Cases', 'Number of People Recovered Daily'),
    ('Active Cases', 'Daily Active Cases'),
]

FIG = go.Figure()
for position, (frame, (label, _)) in enumerate(zip(cases, case_views)):
    FIG.add_trace(go.Scatter(
        x=frame['date'],
        y=frame['cases'],
        name=label,
        mode='markers',
        # Only the first trace is shown initially, so the chart matches the
        # button that is highlighted as active by default.
        visible=(position == 0),
        marker=dict(
            size=5,
            # Colour each marker by its own value so that the colour scale
            # reflects the number of cases (previously random numbers that
            # were unrelated to the data). to_numeric guards against
            # non-numeric placeholders in the column.
            color=pd.to_numeric(frame['cases'], errors='coerce'),
            colorscale='portland',  # one of plotly colorscales
            showscale=True,
        ),
        hovertemplate='<br><b>Date</b>: %{x}' + '<br><i>Cases</i>: %{y}',
    ))

# One button per trace; each button shows its own trace, hides the others
# and sets the matching title.
view_buttons = [
    dict(
        label=label,
        method='update',
        args=[
            {'visible': [other == position for other in range(len(cases))]},
            {'title': title},
        ],
    )
    for position, (label, title) in enumerate(case_views)
]
FIG.update_layout(updatemenus=[dict(
    buttons=view_buttons,
    type="buttons", direction="down", showactive=True,
    y=1.5, yanchor="top", bgcolor="#ffcc00",
)])

# Range slider plus preset time-window buttons on the x-axis.
FIG.update_xaxes(
    showticklabels=True,
    rangeslider_visible=True,
    rangeselector=dict(buttons=[
        dict(count=1, label="1m", step="month", stepmode="backward"),
        dict(count=6, label="6m", step="month", stepmode="backward"),
        dict(count=1, label="YTD", step="year", stepmode="todate"),
        dict(count=1, label="1y", step="year", stepmode="backward"),
        dict(step="all"),
    ]),
)
# The y-axis shows case counts (it was previously labelled "Number of Doses").
FIG.update_layout(
    title_text="Daily statistics for the number of cases", title_x=0.5,
    title_font_size=20, paper_bgcolor="#ccffcc",
    xaxis_title="Date", yaxis_title="Number of Cases",
)
FIG.show()
# Preview the first five rows of the vaccine table.
vaccine.head(5)
| country_region | date | doses_admin | people_partially_vaccinated | people_fully_vaccinated | report_date_string | lat | long | population | continent_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2021-02-22 | 0.0 | 0.0 | 0.0 | 2021-02-22 | 33.93911 | 67.709953 | 38928341.0 | Asia |
| 1 | Afghanistan | 2021-02-23 | 0.0 | 0.0 | 0.0 | 2021-02-23 | 33.93911 | 67.709953 | 38928341.0 | Asia |
| 2 | Afghanistan | 2021-02-24 | 0.0 | 0.0 | 0.0 | 2021-02-24 | 33.93911 | 67.709953 | 38928341.0 | Asia |
| 3 | Afghanistan | 2021-02-25 | 0.0 | 0.0 | 0.0 | 2021-02-25 | 33.93911 | 67.709953 | 38928341.0 | Asia |
| 4 | Afghanistan | 2021-02-26 | 0.0 | 0.0 | 0.0 | 2021-02-26 | 33.93911 | 67.709953 | 38928341.0 | Asia |
# Parse both date columns into datetimes.
for date_field in ('date', 'report_date_string'):
    vaccine[date_field] = pd.to_datetime(vaccine[date_field])
# Drop the coordinates; the set_index/reset_index round trip moves
# 'continent_name' to the first column and renumbers the rows.
vaccine = (
    vaccine.drop(columns=['lat', 'long'])
    .set_index('continent_name')
    .reset_index()
)
continent_column = vaccine['continent_name']
print(continent_column.unique())
print(continent_column.nunique())
vaccine.head()
['Asia' 'Europe' 'Africa' 'North America' 'South America' 'Oceania'] 6
| continent_name | country_region | date | doses_admin | people_partially_vaccinated | people_fully_vaccinated | report_date_string | population | |
|---|---|---|---|---|---|---|---|---|
| 0 | Asia | Afghanistan | 2021-02-22 | 0.0 | 0.0 | 0.0 | 2021-02-22 | 38928341.0 |
| 1 | Asia | Afghanistan | 2021-02-23 | 0.0 | 0.0 | 0.0 | 2021-02-23 | 38928341.0 |
| 2 | Asia | Afghanistan | 2021-02-24 | 0.0 | 0.0 | 0.0 | 2021-02-24 | 38928341.0 |
| 3 | Asia | Afghanistan | 2021-02-25 | 0.0 | 0.0 | 0.0 | 2021-02-25 | 38928341.0 |
| 4 | Asia | Afghanistan | 2021-02-26 | 0.0 | 0.0 | 0.0 | 2021-02-26 | 38928341.0 |
We start by creating 6 different dataframes for the continents, then we transfer the data from the main dataset to these dataframes by using boolean masking. Then we will sum up the values for each date using the pivot_table() function.
# Create one empty frame per continent, all with the same column layout.
continent_frame_columns = [
    'date', 'doses_admin', 'people_partially_vaccinated',
    'people_fully_vaccinated', 'country_region',
]
Asia, Europe, Africa, Namerica, Samerica, Oceania = (
    pd.DataFrame(columns=continent_frame_columns) for _ in range(6)
)
I followed a similar approach of dropping unnecessary features, creating six different data frames for the six continents present in the data, and using Boolean masking to transfer data from the main dataset to those data frames. Here, I used a different function, pivot_table(), to sum up the three different vaccine features for each unique date.
# Asia: select the continent's rows, express the three vaccination counts in
# thousands, and total them per date.
measure_columns = ['doses_admin', 'people_partially_vaccinated', 'people_fully_vaccinated']
Asia = vaccine.loc[
    vaccine['continent_name'] == 'Asia',
    ['date', *measure_columns, 'country_region'],
].dropna().copy()
Asia[measure_columns] = Asia[measure_columns] / 1000
# The string 'sum' is used instead of np.sum, which newer pandas versions
# warn about when it is passed as an aggregation function.
Asia = pd.pivot_table(Asia, index=['date'], values=measure_columns, aggfunc='sum')
Asia = Asia.reset_index()
# Europe: select the continent's rows, express the three vaccination counts
# in thousands, and total them per date.
measure_columns = ['doses_admin', 'people_partially_vaccinated', 'people_fully_vaccinated']
Europe = vaccine.loc[
    vaccine['continent_name'] == 'Europe',
    ['date', *measure_columns, 'country_region'],
].dropna().copy()
Europe[measure_columns] = Europe[measure_columns] / 1000
# The string 'sum' is used instead of np.sum, which newer pandas versions
# warn about when it is passed as an aggregation function.
Europe = pd.pivot_table(Europe, index=['date'], values=measure_columns, aggfunc='sum')
Europe = Europe.reset_index()
# Africa: select the continent's rows, express the three vaccination counts
# in thousands, and total them per date.
measure_columns = ['doses_admin', 'people_partially_vaccinated', 'people_fully_vaccinated']
Africa = vaccine.loc[
    vaccine['continent_name'] == 'Africa',
    ['date', *measure_columns, 'country_region'],
].dropna().copy()
Africa[measure_columns] = Africa[measure_columns] / 1000
# The string 'sum' is used instead of np.sum, which newer pandas versions
# warn about when it is passed as an aggregation function.
Africa = pd.pivot_table(Africa, index=['date'], values=measure_columns, aggfunc='sum')
Africa = Africa.reset_index()
# Oceania: select the continent's rows, express the three vaccination counts
# in thousands, and total them per date.
measure_columns = ['doses_admin', 'people_partially_vaccinated', 'people_fully_vaccinated']
Oceania = vaccine.loc[
    vaccine['continent_name'] == 'Oceania',
    ['date', *measure_columns, 'country_region'],
].dropna().copy()
Oceania[measure_columns] = Oceania[measure_columns] / 1000
# The string 'sum' is used instead of np.sum, which newer pandas versions
# warn about when it is passed as an aggregation function.
Oceania = pd.pivot_table(Oceania, index=['date'], values=measure_columns, aggfunc='sum')
Oceania = Oceania.reset_index()
# North America: select the continent's rows, express the three vaccination
# counts in thousands, and total them per date.
measure_columns = ['doses_admin', 'people_partially_vaccinated', 'people_fully_vaccinated']
Namerica = vaccine.loc[
    vaccine['continent_name'] == 'North America',
    ['date', *measure_columns, 'country_region'],
].dropna().copy()
Namerica[measure_columns] = Namerica[measure_columns] / 1000
# The string 'sum' is used instead of np.sum, which newer pandas versions
# warn about when it is passed as an aggregation function.
Namerica = pd.pivot_table(Namerica, index=['date'], values=measure_columns, aggfunc='sum')
Namerica = Namerica.reset_index()
# South America: select the continent's rows, express the three vaccination
# counts in thousands, and total them per date.
measure_columns = ['doses_admin', 'people_partially_vaccinated', 'people_fully_vaccinated']
Samerica = vaccine.loc[
    vaccine['continent_name'] == 'South America',
    ['date', *measure_columns, 'country_region'],
].dropna().copy()
Samerica[measure_columns] = Samerica[measure_columns] / 1000
# The string 'sum' is used instead of np.sum, which newer pandas versions
# warn about when it is passed as an aggregation function.
Samerica = pd.pivot_table(Samerica, index=['date'], values=measure_columns, aggfunc='sum')
Samerica = Samerica.reset_index()
# Sort every continent table by date (ascending).
Asia, Africa, Oceania, Europe, Namerica, Samerica = (
    table.sort_values(by='date', ascending=True)
    for table in (Asia, Africa, Oceania, Europe, Namerica, Samerica)
)
# Order used by the plotting code: Asia, Africa, Oceania, Europe,
# North America, South America.
vac_countries = [Asia, Africa, Oceania, Europe, Namerica, Samerica]
Samerica.head()
| date | doses_admin | people_fully_vaccinated | people_partially_vaccinated | |
|---|---|---|---|---|
| 0 | 2020-12-24 | 0.420 | 0.0 | 0.420 |
| 1 | 2020-12-25 | 5.198 | 0.0 | 5.198 |
| 2 | 2020-12-26 | 8.338 | 0.0 | 8.338 |
| 3 | 2020-12-27 | 8.649 | 0.0 | 8.649 |
| 4 | 2020-12-28 | 8.649 | 0.0 | 8.649 |
For the Vaccine dataset, I wanted to do something new with the choices that I offer to the customer. On many COVID-19 related websites, there is a list of countries from which customers choose the country whose virus statistics they want to view. Also, there are not that many websites that display the global vaccination statistics. Most of the customers who view the statistics don’t want in-depth information for every country. I have cleaned the dataset to display information not for 224 countries, but only for the 6 continents present in the data.
My choice for these graphs is stacked bar charts. I have chosen a stacked bar chart for each continent because I am displaying three features for the vaccination status: the number of doses administered, the number of people partially vaccinated, and the number of people fully vaccinated. Similar to the previous graphs, I have used the plotly library for these graphs, as it has a lot of options for color-coding, provides mechanisms for hover effects, and provides interactivity mechanisms for the graphs. For interactivity, I have created 6 buttons, one for each of the 6 continents in the data, that will display the stacked bar chart for the continent whose button the customer clicks.
I have used the built-in color scales in the Plotly library to show the severity of the situation, as the number of cases, and the number of vaccine cases increase, the colors get darker.
The functions that I have used are detailed below:
add_trace(): comes in plotly package will allow us to trace the details from time to time and "hovertemplate" attribute contains the details of both the axis when hovered.
update_layout(): contains an attribute "updatemenus" which allow us to add any type of menu with additional details. In this visualisation, I have selected "buttons" in "down" direction.
update_xaxes(): updates the x-axis settings. I have set "showticklabels=True" so that the x-axis labels are displayed, and I also use this call to attach the range slider and the range selector.
update_layout(): will help to design the background layout to the graph by adding x_label, y_label, titles and so on.
rangeslider() will enable the user to select the desired range for which they want to see the data.
rangeselector() are pre programmed buttons that will enable users to directly select the time period for which they want to see the visualization.
# Stacked bar chart of the vaccination figures with one button per continent.
# (button label, figure title) for each entry of `vac_countries`, same order.
continent_views = [
    ('Asia', 'Asia'),
    ('Africa', 'Africa'),
    ('Oceania', 'Oceania'),
    ('Europe', 'Europe'),
    ('North america', 'North America'),
    ('South america', 'South America'),
]
# (trace name, column, colour scale, hover label) for the three stacked measures.
measures = [
    ('Doses', 'doses_admin', 'greens', 'Doses'),
    ('Partially Vaccinated', 'people_partially_vaccinated', 'oranges', 'Par. Vac'),
    ('Fully Vaccinated', 'people_fully_vaccinated', 'blues', 'Fully Vaccinated'),
]

fig = go.Figure()
# Traces are added measure by measure, so trace k belongs to the continent
# at position k % len(vac_countries).
for trace_name, column, scale, hover_label in measures:
    for position, continent in enumerate(vac_countries):
        fig.add_trace(go.Bar(
            name=trace_name,
            x=continent['date'],
            y=continent[column],
            # Start with the first continent only, matching the button that
            # is highlighted as active by default.
            visible=(position == 0),
            marker=dict(
                # Colour each bar by its own value so that the colour scale
                # reflects magnitude (previously random numbers).
                color=continent[column],
                colorscale=scale,  # one of plotly colorscales
            ),
            hovertemplate='<br><b>Date</b>: %{x}' + '<br><i>' + hover_label + '</i>: %{y}',
        ))

# One button per continent. Each visibility mask has one entry per trace
# (len(measures) * len(vac_countries)), so the result does not depend on how
# a shorter mask would be applied to the remaining traces.
continent_buttons = [
    dict(
        label=label,
        method='update',
        args=[
            {'visible': [position == chosen
                         for _ in measures
                         for position in range(len(vac_countries))]},
            {'title': title},
        ],
    )
    for chosen, (label, title) in enumerate(continent_views)
]
fig.update_layout(updatemenus=[dict(
    buttons=continent_buttons,
    type="buttons", direction="down", showactive=True,
    y=1.5, yanchor="top", bgcolor="#ff6600",
)])

# Range slider plus preset time-window buttons on the x-axis.
fig.update_xaxes(
    showticklabels=True,
    rangeslider_visible=True,
    rangeselector=dict(buttons=[
        dict(count=1, label="1m", step="month", stepmode="backward"),
        dict(count=6, label="6m", step="month", stepmode="backward"),
        dict(count=1, label="YTD", step="year", stepmode="todate"),
        dict(count=1, label="1y", step="year", stepmode="backward"),
        dict(step="all"),
    ]),
)
# The continent tables hold doses and people counts divided by 1000, so the
# y-axis is labelled in thousands rather than "Number of Doses".
fig.update_layout(
    barmode='stack', title_text="Vaccination Status", title_x=0.5,
    title_font_size=20, paper_bgcolor="#ffcc99",
    xaxis_title="Date", yaxis_title="Count (thousands)",
)
fig.show()